## Modules for data loading
import os
import copy
import netCDF4 as nc
import pandas as pd
import datetime
## Modules for statistical analysis & wrangling
import numpy as np
from scipy import stats
import astropy.convolution.convolve as convolve
# FIX: the line above binds the *module* astropy.convolution.convolve (which is
# not callable), so ConvGistar's convolve(...) calls would raise TypeError.
# Rebind `convolve` to the actual function:
from astropy.convolution import convolve
from scipy.signal import convolve2d
import spreg
import statsmodels.tsa.seasonal
## Modules for clustering
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from kneed import KneeLocator
## Modules for mapping & plotting
import cartopy as cart
import cartopy.crs as ccrs
import cartopy.io.shapereader
from shapely import geometry
from shapely.ops import unary_union
from shapely.prepared import prep
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
# Gauss circle function
def gauss_circle(r=5):
    """Return a (2r+1, 2r+1) int array: 1 inside a disc of radius r, 0 outside.

    The disc is centred on the array centre; a cell (row, col) is inside when
    its Euclidean distance to the centre is <= r. Used as the spatial weight
    kernel for the ConvGistar statistic.

    Parameters
    ----------
    r : int, default 5
        Kernel radius in grid cells.
    """
    # Vectorized replacement for the original double loop: integer squared
    # distances compared against r**2 (exactly equivalent to the sqrt form).
    yy, xx = np.ogrid[-r:r + 1, -r:r + 1]
    return (xx * xx + yy * yy <= r * r).astype(int)
# ConvGistar function
def ConvGistar(input_array, kernel=None):
    """Compute a convolution-based Getis-Ord Gi* statistic per time slice.

    Parameters
    ----------
    input_array : 3-D array, shape (time, lat, lon)
        NaNs mark masked (land / missing) cells and are preserved in the output.
    kernel : 2-D array or None
        Binary spatial weight kernel; defaults to gauss_circle() (disc, r=5).

    Returns
    -------
    Array with the same shape as input_array holding the Gi* values; cells
    that were NaN in the input remain NaN.
    """
    # Local import of the *function*; the module-level
    # `import astropy.convolution.convolve as convolve` binds the submodule,
    # which is not callable.
    from astropy.convolution import convolve
    # Avoid a mutable (ndarray) default argument; build the kernel per call.
    if kernel is None:
        kernel = gauss_circle()
    result = np.empty(input_array.shape)
    x_bar = np.nanmean(input_array, axis=(1, 2))      # spatial mean per slice
    n = input_array.shape[1] * input_array.shape[2]   # cells per slice
    w_sum = np.nansum(kernel)                          # hoisted loop invariants
    w_sq_sum = np.nansum(np.square(kernel))
    for t in range(input_array.shape[0]):
        conv = convolve(input_array[t], kernel, boundary='fill',
                        fill_value=np.nan, preserve_nan=True,
                        normalize_kernel=False)
        conv_sq = convolve(np.square(input_array[t]), kernel, boundary='fill',
                           fill_value=np.nan, preserve_nan=True,
                           normalize_kernel=False)
        # Gi*(x) = (sum(w x) - Xbar*W) / (S * sqrt((n*sum(w^2) - W^2)/(n-1)))
        # with S approximated here via the local convolved second moment.
        result[t] = (conv - x_bar[t] * w_sum) / (
            np.sqrt(conv_sq - np.square(x_bar[t]))
            * np.sqrt((n * w_sq_sum - np.square(w_sum)) / (n - 1)))
    # Re-impose the input NaN mask (boundary fill can leak values inward).
    result[np.isnan(input_array)] = np.nan
    return result
# is_land function
def is_land(land, lon, lat):
    """Return True when the point (lon, lat) lies inside the prepared
    land geometry `land` (a shapely PreparedGeometry)."""
    point = geometry.Point(lon, lat)
    return land.contains(point)
Load TROPOMI dataset: NO2 VCD
## Load TROPOMI dataset (netCDF): NO2 vertical column densities on a lat/lon grid
no2_nc_filename = "./dataset/cube_no2_shipping_RedSea.nc"
## Read netCDF file
ds = nc.Dataset(no2_nc_filename, 'r')
## Store dimension information (time, latitude, and longitude)
## NOTE(review): assumes the dimension order in the file is (time, lat, lon) — confirm
dim_names = list(ds.dimensions) # get variable keys
time, lats, lons = ds.variables[dim_names[0]][:], ds.variables[dim_names[1]][:], ds.variables[dim_names[2]][:]
print("Dimensions of netCDF NO2 data:")
for dim in ds.dimensions.items():
    print(dim)
## Store NO2 column measurements from netCDF dataset
product = ds['/PRODUCT']
no2_vcd_attr = list(product.variables.items())
no2_vcd_name = no2_vcd_attr[0][0]  # first variable in the /PRODUCT group
no2_vcd = product.variables[no2_vcd_name][:len(time)] # vertical column
## Close netCDF file
ds.close()
## Crop the dataset and dimension w.r.t. the region of interest
## If interested in a smaller ROI, change lats_roi = (0, lats_roi), and lons_roi = (0, lons_roi) where lats_roi <= lats.size and lons_roi <= lons.size
lats_roi = (0, lats.size)   # full extent by default
lons_roi = (0, lons.size)
lats, lons = lats[lats_roi[0]:lats_roi[1]], lons[lons_roi[0]:lons_roi[1]]
no2_vcd = no2_vcd[:, lats_roi[0]:lats_roi[1], lons_roi[0]:lons_roi[1]]
extent = [min(lons), max(lons), min(lats), max(lats)] # coordinate extent of given netCDF dataset
## only run when Indian Ocean
# no2_vcd[330, :, 160:] = np.nan ## Part of Indian Ocean dataset on 29 October 2019 should be masked due to large noise
del dim_names, dim, ds, product, no2_vcd_attr, no2_vcd_name
Dimensions of netCDF NO2 data:
('Time', <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'Time', size = 1072)
('Latitude', <class 'netCDF4._netCDF4.Dimension'>: name = 'Latitude', size = 320)
('Longitude', <class 'netCDF4._netCDF4.Dimension'>: name = 'Longitude', size = 320)
Load shipping frequency dataset
## Load shipping frequency dataset (npy)
ship_freq_npy_filename = "./dataset/ship_freq_RedSea.npy" # Using ship track count data from Halpern, B. et al. (2015), regridded on the same WGS84 coordinate
# [::-1, :] flips the latitude axis so the array orientation matches the netCDF grid,
# then the same ROI crop as the NO2 cube is applied
ship_freq = np.load(ship_freq_npy_filename)[::-1, :][lats_roi[0]:lats_roi[1], lons_roi[0]:lons_roi[1]]
ship_freq = ship_freq/np.nanmax(ship_freq) # normalization; range of ship track data: [0, max. of ship_freq)
del ship_freq_npy_filename
Filter out the land coordinates from NO2 VCD
# Build ocean_mask: 1.0 over ocean cells, NaN over land; used to mask NO2 to ocean only
x, y = np.array(lons), np.array(lats)
xx, yy = np.meshgrid(x, y, sparse = True)  # NOTE(review): xx/yy appear unused below
ocean_mask = np.empty([len(lats), len(lons)])
# Prepared union of Natural Earth 10m land polygons for fast point-in-polygon tests
land = prep(unary_union(list(cartopy.io.shapereader.Reader(cartopy.io.shapereader.natural_earth(resolution='10m', category='physical', name='land')).geometries())))
for i in range(len(lons)):
    for j in range(len(lats)):
        ocean_mask[j, i] = not is_land(land, x[i], y[j])
# Replace land coordinate with NaN
ocean_mask[ocean_mask==0] = np.nan
print("Check on ocean_mask with NaN (1. if ocean, np.nan if land):\n", ocean_mask)
# Broadcast the 2-D mask over the time axis; .data drops the netCDF masked-array mask
no2_vcd = np.multiply(np.array(copy.deepcopy(no2_vcd)[:, :, :].data), ocean_mask)
Check on ocean_mask with NaN (1. if ocean, np.nan if land): [[ 1. 1. 1. ... nan nan nan] [ 1. 1. 1. ... nan nan nan] [ 1. 1. 1. ... nan nan nan] ... [nan nan nan ... 1. 1. 1.] [nan nan nan ... 1. 1. 1.] [nan nan nan ... 1. 1. 1.]]
Adjust plotting parameters
## Customize parameters for plotting
## Parameters for publication (high resolution)
plt.style.use('ggplot')
plt.rcParams['font.size'] = 13.5
plt.rcParams['xtick.labelsize'], plt.rcParams['ytick.labelsize'] = 13.5, 13.5
plt.rcParams['text.color'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
# Figure aspect derived from the ROI's width/height in degrees
fig_size = [abs(extent[1] - extent[0]), abs(extent[3] - extent[2])]
Plot base map
## Plot base map with the region of interest outlined
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
ax.set_extent([extent[0] - 1, extent[1] + 1, extent[2] - 1, extent[3] + 1])  # pad 1 degree around the ROI
## Add a rectangle patch and an annotation marking the ROI
ax.add_patch(matplotlib.patches.Rectangle((extent[0], extent[2]), fig_size[0], fig_size[1], linewidth=1, edgecolor='r', facecolor='none'))
ax.annotate('Region of Interest', xy=(extent[0], extent[3]), xytext=(extent[0]*1.05, extent[3]*0.90),
            arrowprops=dict(facecolor='red', shrink=0.05))
plt.show()
Plot ocean coordinates only
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot ocean-coordinates only (NaN cells render transparent)
plt.imshow(ocean_mask, extent=[extent[0], extent[1], extent[2], extent[3]], cmap='viridis')
plt.show()
Plot true shipping frequency
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot true shipping frequency (normalized to [0, 1])
plt.imshow(ship_freq, extent=[extent[0], extent[1], extent[2], extent[3]], cmap='viridis')
# Colorbar uses a fresh ScalarMappable; ticks span the rounded data range in 11 steps
plt.colorbar(matplotlib.cm.ScalarMappable(cmap='viridis'),
             fraction=0.0426, pad = 0.175, extend='neither', ax=plt.gca(),
             ticks=np.linspace(np.floor(np.nanmin(ship_freq) * 100) / 100, np.ceil(np.nanmax(ship_freq) * 100) / 100, 11),
             label='Normalized ship track count').outline.set_edgecolor(None)
plt.show()
Plot daily measurement data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot daily measurement data for one time index
plt.imshow(copy.deepcopy(no2_vcd)[888, :, :], # May 11th 2021
           cmap='viridis', extent=[extent[0], extent[1], extent[2], extent[3]], vmin=0, vmax=0.00005)
# NOTE(review): cmap= is not a documented plt.colorbar kwarg in recent matplotlib — verify version
plt.colorbar(cmap='viridis', fraction=0.0426, pad=0.175, extend='both',
             label='NO$_{2}$ vertical column (mol m$^{-2}$)').outline.set_edgecolor(None)
plt.show()
Plot average measurement data over entire period
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot average measurement data over entire period (NaN-aware mean over time)
plt.imshow(np.nanmean(copy.deepcopy(no2_vcd)[:, :, :], axis = 0),
           cmap='viridis', extent=[extent[0], extent[1], extent[2], extent[3]], vmin=0, vmax=0.00005)
# NOTE(review): cmap= is not a documented plt.colorbar kwarg in recent matplotlib — verify version
plt.colorbar(cmap='viridis', fraction=0.0426, pad=0.175, extend='both',
             label='NO$_{2}$ vertical column (mol m$^{-2}$)').outline.set_edgecolor(None)
plt.show()
Perform statistical analysis
## Prepare data: flatten the time-averaged NO2 grid and the ship-track grid to columns
NO2 = np.nanmean(copy.deepcopy(no2_vcd)[:, :, :], axis=0).reshape(-1, 1)
SHIP_TRACK = copy.deepcopy(ship_freq).reshape(-1, 1)
mask = ~np.isnan(NO2) & ~np.isnan(SHIP_TRACK)  # keep cells valid in both grids
X = NO2[mask].reshape(-1, 1)
y = SHIP_TRACK[mask]
## Pearson correlation
print('Spatial correlation between NO2 column level & ship track counts:', stats.pearsonr(NO2[mask], SHIP_TRACK[mask]))
# Fit OLS model: ship track count ~ raw NO2 column
m1 = spreg.OLS(
    # Dependent variable
    y,
    # Independent variables
    X,
    # Dependent variable name
    name_y='Ship track',
    # Independent variable name
    name_x=['Original NO2']
)
print(m1.summary)
Spatial correlation between NO2 column level & ship track counts: PearsonRResult(statistic=0.1508017057311495, pvalue=3.0986167591790774e-138)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : None
Dependent Variable : Ship track Number of Observations: 27231
Mean dependent var : 0.1240 Number of Variables : 2
S.D. dependent var : 0.1704 Degrees of Freedom : 27229
R-squared : 0.0227
Adjusted R-squared : 0.0227
Sum squared residual: 772.356 F-statistic : 633.6283
Sigma-square : 0.028 Prob(F-statistic) : 3.099e-138
S.E. of regression : 0.168 Log likelihood : 9868.366
Sigma-square ML : 0.028 Akaike info criterion : -19732.731
S.E of regression ML: 0.1684 Schwarz criterion : -19716.307
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 0.0234761 0.0041228 5.6942356 0.0000000
Original NO2 6965.7221079 276.7252891 25.1719752 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 7.953
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 67802.441 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 1 2333.547 0.0000
Koenker-Bassett test 1 573.868 0.0000
================================ END OF REPORT =====================================
Plot scatter plot of original NO2 data
## Scatter plot of NO2 data vs normalized ship track counts
fig_boxplot = sns.scatterplot(x=NO2[mask], y=SHIP_TRACK[mask], alpha=1, color='#440154', linewidth=0.1, s=10)
# Record axis limits before outlier removal so the post-filter scatter is comparable
min_before_outlier_removal, max_before_outlier_removal = np.nanmin(NO2[mask]), np.nanmax(NO2[mask])
fig_boxplot.set(xlabel='NO$_2$ vertical column (mol m$^{-2}$)',
                ylabel='Normalized ship track count',
                ylim = (-0.02, 1.02),
                xlim = (0.99 * min_before_outlier_removal, 1.01 * max_before_outlier_removal))
plt.ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
plt.show()
Perform clustering
## Cluster NO2 data (average over entire period) with k-means; k chosen by the elbow method
clustered_no2_vcd = np.nanmean(np.multiply(copy.deepcopy(no2_vcd)[:, :, :], ocean_mask), axis=0)
X = clustered_no2_vcd[~np.isnan(clustered_no2_vcd)].reshape(-1, 1)  # ocean cells as a column vector
## Calculate distortions per k
distortions = []
K = range(1, 16) # grid search of k from 1 to 15
for k in K:
    # FIX: the original fit the model twice per k (chained .fit plus an extra
    # kmeanModel.fit(X) on the next line); one fit suffices and is identical
    # here because random_state is fixed.
    kmeanModel = KMeans(init='k-means++', n_clusters=k, random_state=1).fit(X)
    # Distortion = mean distance of each sample to its nearest centroid
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
plt.plot(K, distortions, linewidth=3, color='#440154')
plt.grid(True)
plt.xlabel('Number of clusters', color='black')
plt.ylabel('Distortion score', color='black')
plt.show()
## Find the optimal number of clusters (knee of the distortion curve)
kn = KneeLocator(K, distortions, curve='convex', direction='decreasing')
optimal_k = kn.knee
print("Optimal number of clusters: ", optimal_k)
## Run k-means clustering with optimal number of clusters
kmeans1 = KMeans(init='k-means++', n_clusters=optimal_k, random_state=1).fit(X)
# Relabel clusters so label order follows ascending cluster-center magnitude
idx = np.argsort(kmeans1.cluster_centers_.sum(axis = 1))
lut = np.zeros_like(idx)
lut[idx] = np.arange(optimal_k)
kmeans1_lab = lut[kmeans1.labels_] + 1  # labels run 1..optimal_k
# Scatter the 1-D labels back onto the 2-D grid at the ocean cell positions
temp_1 = np.zeros([len(lats), len(lons)], dtype=int)
ocean_idx = np.where(ocean_mask.flatten() == 1)
# land_idx = np.where(ocean_mask.flatten() != 1)
np.put(temp_1, ocean_idx, kmeans1_lab)
no2_kmeans_label = np.multiply(temp_1, ocean_mask)  # land cells back to NaN
print("K-means label of NO2:\n", no2_kmeans_label)
del clustered_no2_vcd, X, K, k, kn, kmeans1, idx, lut, kmeans1_lab, temp_1
Optimal number of clusters: 4 K-means label of NO2: [[ 2. 2. 2. ... nan nan nan] [ 2. 2. 2. ... nan nan nan] [ 2. 2. 2. ... nan nan nan] ... [nan nan nan ... 2. 2. 2.] [nan nan nan ... 1. 1. 1.] [nan nan nan ... 1. 1. 1.]]
Plot clustered data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot clustered data with a discrete viridis colormap (one band per cluster)
no2_clim_min, no2_clim_max = np.nanmin(no2_kmeans_label), np.nanmax(no2_kmeans_label)
# NOTE(review): matplotlib.cm.get_cmap is deprecated since matplotlib 3.7 — verify version
plt.imshow(no2_kmeans_label, cmap = matplotlib.cm.get_cmap("viridis", optimal_k),
           extent=[extent[0], extent[1], extent[2], extent[3]],
           vmin = np.nanmin(no2_kmeans_label), vmax = np.nanmax(no2_kmeans_label))
cbar = plt.colorbar(fraction=0.0426, pad=0.175, label='Cluster label')
labels = np.linspace(no2_clim_min, no2_clim_max, optimal_k, dtype='int32')
# NOTE(review): 0.75*label + 0.625 centers ticks on the color bands only when
# optimal_k == 4 — confirm if k changes
loc = 0.75 * labels + 0.625
cbar.set_ticks(loc)
cbar.set_ticklabels(labels)
cbar.outline.set_edgecolor(None)
plt.show()
Perform statistical analysis
## Prepare data: regress ship tracks on the integer k-means cluster label (1..k)
NO2 = copy.deepcopy(no2_kmeans_label).reshape(-1, 1)
SHIP_TRACK = copy.deepcopy(ship_freq).reshape(-1, 1)
mask = ~np.isnan(NO2) & ~np.isnan(SHIP_TRACK)  # keep cells valid in both grids
X = NO2[mask].reshape(-1, 1)
y = SHIP_TRACK[mask]
## Pearson correlation
print('Spatial correlation between NO2 column level & ship track counts:', stats.pearsonr(NO2[mask], SHIP_TRACK[mask]))
# Fit OLS model: ship track count ~ raw-NO2 cluster label
m2 = spreg.OLS(
    # Dependent variable
    y,
    # Independent variables
    X,
    # Dependent variable name
    name_y='Ship track',
    # Independent variable name
    name_x=['NO2 cluster w/o ConvGi*']
)
print(m2.summary)
Spatial correlation between NO2 column level & ship track counts: PearsonRResult(statistic=0.25491521162444686, pvalue=0.0)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : None
Dependent Variable : Ship track Number of Observations: 27231
Mean dependent var : 0.1240 Number of Variables : 2
S.D. dependent var : 0.1704 Degrees of Freedom : 27229
R-squared : 0.0650
Adjusted R-squared : 0.0649
Sum squared residual: 738.972 F-statistic : 1892.3572
Sigma-square : 0.027 Prob(F-statistic) : 0
S.E. of regression : 0.165 Log likelihood : 10469.974
Sigma-square ML : 0.027 Akaike info criterion : -20935.947
S.E of regression ML: 0.1647 Schwarz criterion : -20919.523
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 0.0212482 0.0025649 8.2843396 0.0000000
NO2 cluster w/o ConvGi* 0.0589597 0.0013554 43.5012321 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 4.936
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 65686.622 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 1 4110.224 0.0000
Koenker-Bassett test 1 1017.322 0.0000
================================ END OF REPORT =====================================
Calculate statistics per cluster (mean & std)
## Store the cluster labels as a mask: dict_cluster[c] is 1.0 where label == c, NaN elsewhere
dict_cluster = {}
for i in range(optimal_k):
    tmp = copy.deepcopy(no2_kmeans_label)
    tmp = tmp / (i + 1)  # cells of cluster i+1 become exactly 1
    tmp[tmp != 1] = np.nan
    dict_cluster[i + 1] = tmp
## Copy data so the per-cluster statistics do not mutate the originals
no2_vcd_copied = copy.deepcopy(no2_vcd)
ship_freq_copied = copy.deepcopy(ship_freq)
## Define rounding function to designated significant numbers
def round_sf(number, significant):
    """Round `number` to `significant` significant figures.

    Parameters
    ----------
    number : float or int
        Value to round. Zero is returned unchanged (log10(0) is undefined;
        the original implementation raised on zero input).
    significant : int
        Number of significant figures to keep.
    """
    if number == 0:
        return 0
    return round(number, significant - int(np.floor(np.log10(abs(number)))) - 1)
## Calculate mean and std of the NO2 measurements per cluster (4 significant figures)
dict_stat_per_cluster = {}
for i in range(1, optimal_k+1):
    dict_stat_per_cluster[i] = round_sf(np.nanmean(np.multiply(no2_vcd_copied, dict_cluster[i])), 4), round_sf(np.nanstd(np.multiply(no2_vcd_copied, dict_cluster[i])), 4)
print('Statistics of no2 measurement per cluster (mean, std):\n', dict_stat_per_cluster)
## Calculate shipping frequency per cluster
dict_ship_freq_per_cluster = {}
for i in range(1, optimal_k+1):
    dict_ship_freq_per_cluster[i] = round_sf(np.nanmean(np.multiply(ship_freq_copied, dict_cluster[i])), 4), round_sf(np.nanstd(np.multiply(ship_freq_copied, dict_cluster[i])), 4)
print('Statistics of shipping frequency per cluster (mean, std):\n', dict_ship_freq_per_cluster)
del i, tmp, no2_vcd_copied, ship_freq_copied
Statistics of no2 measurement per cluster (mean, std):
{1: (1.19e-05, 5.747e-06), 2: (1.476e-05, 7.22e-06), 3: (1.924e-05, 1.3e-05), 4: (3.291e-05, 2.933e-05)}
Statistics of shipping frequency per cluster (mean, std):
{1: (0.04851, 0.05815), 2: (0.1897, 0.199), 3: (0.1548, 0.2107), 4: (0.03216, 0.01366)}
Plot box plot of clustered NO2 data
## Prepare clustered data: ship-track counts grouped by raw-NO2 cluster label
NO2_cluster = no2_kmeans_label.reshape(-1, 1)
SHIP_TRACK = copy.deepcopy(ship_freq).reshape(-1, 1)
mask = ~np.isnan(NO2_cluster) & ~np.isnan(SHIP_TRACK)
# One viridis color per cluster; assumes optimal_k == 4 — confirm if k changes
color_palette_viridis = [ "#440154", "#31688e", "#35b779", "#fde725"]
## Box plot of NO2 clustered data (red marker = cluster mean)
fig_boxplot = sns.boxplot(x='NO2_cluster', y='Ship', data={'NO2_cluster': NO2_cluster[mask], 'Ship': SHIP_TRACK[mask]}, palette=color_palette_viridis,
                          showfliers=False, showmeans=True, meanprops={"markerfacecolor":"red", "markeredgecolor":"red"})
fig_boxplot.set(xlabel='Original NO$_2$ kmeans cluster label', ylabel='Normalized ship track count')
plt.show()
Calculate $ConvG_i^*$
# Calculate ConvGistar (Getis-Ord Gi* via convolution) on the ocean-masked NO2 cube
no2_ConvGistar = ConvGistar(np.multiply(copy.deepcopy(no2_vcd), ocean_mask))
Plot $ConvG_i^*$
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot the ConvGi* field for one day
plt.imshow(copy.deepcopy(no2_ConvGistar)[888, :, :], # May 11th 2021
           cmap='viridis', extent=[extent[0], extent[1], extent[2], extent[3]], vmin=-.5, vmax=.5)
# NOTE(review): cmap= is not a documented plt.colorbar kwarg in recent matplotlib — verify version
plt.colorbar(cmap='viridis',
             fraction=0.0426, pad=0.175, extend='both',
             label='$ConvG_i^*$ of NO$_{2}$ vertical column').outline.set_edgecolor(None)
plt.show()
Build mask to filter out land & near-coast coordinates
# Build ocean_mask_far_coast: ocean cells at least `rad` pixels away from any land cell
x, y = np.array(lons), np.array(lats)
xx, yy = np.meshgrid(x, y, sparse=True)  # NOTE(review): xx/yy appear unused below
land = prep(unary_union(list(cartopy.io.shapereader.Reader(cartopy.io.shapereader.natural_earth(resolution='10m', category='physical', name='land')).geometries())))
ocean_mask_far_coast = np.empty([len(lats), len(lons)])
for i in range(len(lons)):
    for j in range(len(lats)):
        ocean_mask_far_coast[j, i] = not is_land(land, x[i], y[j])
land_mask = -(ocean_mask_far_coast) + 1  # invert: 1 on land, 0 on ocean
rad = 20 # hyperparameter of kernel radius (pixels)
kernel_xx, kernel_yy = np.meshgrid(np.arange(-rad, rad + 1, 1), np.arange(-rad, rad + 1, 1))
kernel_near_coast = np.square(kernel_xx) + np.square(kernel_yy) <= np.square(rad)  # disc of radius rad
# Dilate land by the disc kernel; any touched cell is "near coast"; invert back so far ocean = 1
ocean_mask_far_coast = -(convolve2d(land_mask.astype(int), kernel_near_coast.astype(int), mode='same').astype(bool).astype(float)) + 1
ocean_mask_far_coast[ocean_mask_far_coast == 0] = np.nan
no2_vcd_far_coast = np.multiply(copy.deepcopy(no2_vcd), ocean_mask_far_coast)
del rad
Plot near-coast filtered ocean coordinates
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot far-from-coast ocean coordinates only (NaN cells render transparent)
plt.imshow(ocean_mask_far_coast, extent=[extent[0], extent[1], extent[2], extent[3]], cmap='viridis')
plt.show()
Plot near-coast filtered true shipping frequency
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot true shipping frequency restricted to far-from-coast ocean cells
img = np.multiply(ship_freq[:, :], ocean_mask_far_coast)
plt.imshow(img, extent=[extent[0], extent[1], extent[2], extent[3]], cmap='viridis')
plt.colorbar(matplotlib.cm.ScalarMappable(cmap='viridis'),
             fraction=0.0426, pad = 0.175, extend='neither', ax=plt.gca(),
             ticks=np.linspace(np.floor(np.nanmin(img)*100)/100, np.ceil(np.nanmax(img)*100)/100, 11),
             label='Normalized ship track count').outline.set_edgecolor(None)
plt.show()
del img
Plot near-coast filtered daily measurement data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot daily measurement data with the near-coast cells filtered out
plt.imshow(copy.deepcopy(no2_vcd_far_coast)[888, :, :], # May 11th 2021
           cmap='viridis', extent=[extent[0], extent[1], extent[2], extent[3]], vmin=0, vmax=0.00005)
# NOTE(review): cmap= is not a documented plt.colorbar kwarg in recent matplotlib — verify version
plt.colorbar(cmap='viridis', fraction=0.0426, pad=0.175, extend='both',
             label='NO$_{2}$ vertical column (mol m$^{-2}$)').outline.set_edgecolor(None)
plt.show()
Inspect how near-coast removal effectively removes outliers (compared to original NO2 data scatter plot)
## Prepare data: time-averaged far-from-coast NO2 vs ship tracks
NO2 = np.nanmean(copy.deepcopy(no2_vcd_far_coast)[:, :, :], axis=0).reshape(-1, 1)
SHIP_TRACK = copy.deepcopy(ship_freq).reshape(-1, 1)
mask = ~np.isnan(NO2) & ~np.isnan(SHIP_TRACK)
## Scatter plot of NO2 data
fig_boxplot = sns.scatterplot(x=NO2[mask], y=SHIP_TRACK[mask], alpha=1, color='#440154', linewidth=0.1, s=10)
# Reuse the pre-filtering axis limits for a like-for-like visual comparison
fig_boxplot.set(xlabel='NO$_2$ vertical column (mol m$^{-2}$)',
                ylabel='Normalized ship track count',
                ylim = (-0.02, 1.02),
                xlim = (0.99 * min_before_outlier_removal, 1.01 * max_before_outlier_removal))
plt.ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
plt.show()
Calculate $ConvG_i^*$
# Calculate ConvGistar on the near-coast-filtered NO2 cube
no2_vcd_ConvGistar = ConvGistar(copy.deepcopy(no2_vcd_far_coast))
Plot $ConvG_i^*$
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot the far-from-coast ConvGi* field for one day
plt.imshow(copy.deepcopy(no2_vcd_ConvGistar)[888, :, :], # May 11th 2021
           cmap='viridis', extent=[extent[0], extent[1], extent[2], extent[3]], vmin=-.5, vmax=.5)
# NOTE(review): cmap= is not a documented plt.colorbar kwarg in recent matplotlib — verify version
plt.colorbar(cmap='viridis', fraction=0.0426, pad=0.175, extend='both',
             label='$ConvG_i^*$ of NO$_{2}$ vertical column').outline.set_edgecolor(None)
plt.show()
Perform clustering on $ConvG_i^*$
# Time-averaged ConvGi* values excluding NaN, as a column vector of ocean cells
clustered_no2_vcd = np.nanmean(no2_vcd_ConvGistar, axis=0)
X = clustered_no2_vcd[~np.isnan(clustered_no2_vcd)].reshape(-1, 1)
# Run k means and store its label (reuses optimal_k found for the raw NO2 field)
kmeans_fit = KMeans(init='k-means++', n_clusters=optimal_k, random_state=1).fit(X)
# Relabel clusters so label order follows ascending cluster-center magnitude
idx = np.argsort(kmeans_fit.cluster_centers_.sum(axis=1))
lut = np.zeros_like(idx)
lut[idx] = np.arange(optimal_k)
kmeans_fit_lab = lut[kmeans_fit.labels_] + 1  # labels run 1..optimal_k
print("K-means label of NO2:\n", kmeans_fit_lab)
# Mask k means label of land coordinates: scatter labels back onto the 2-D grid
np_zeros = np.zeros([len(lats), len(lons)], dtype=int)
ocean_idx = np.where(ocean_mask_far_coast.flatten() == 1)
np.put(np_zeros, ocean_idx, kmeans_fit_lab)
no2_ConvGistar_kmeans_label = np.multiply(np_zeros, ocean_mask_far_coast)  # land/near-coast back to NaN
del clustered_no2_vcd, X, kmeans_fit, idx, lut, kmeans_fit_lab, np_zeros, ocean_idx
K-means label of NO2: [3 3 3 ... 2 2 2]
Plot clustered $ConvG_i^*$ data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Plot clustered data with a discrete viridis colormap (one band per cluster)
no2_clim_min, no2_clim_max = np.nanmin(no2_ConvGistar_kmeans_label), np.nanmax(no2_ConvGistar_kmeans_label)
# NOTE(review): matplotlib.cm.get_cmap is deprecated since matplotlib 3.7 — verify version
plt.imshow(no2_ConvGistar_kmeans_label, cmap = matplotlib.cm.get_cmap("viridis", optimal_k),
           extent=[extent[0], extent[1], extent[2], extent[3]],
           vmin = np.nanmin(no2_ConvGistar_kmeans_label), vmax = np.nanmax(no2_ConvGistar_kmeans_label))
cbar = plt.colorbar(fraction=0.0426, pad=0.175, label='Cluster label')
labels = np.linspace(no2_clim_min, no2_clim_max, optimal_k, dtype='int32')
# NOTE(review): 0.75*label + 0.625 centers ticks on the color bands only when
# optimal_k == 4 — confirm if k changes
loc = 0.75 * labels + 0.625
cbar.set_ticks(loc)
cbar.set_ticklabels(labels)
cbar.outline.set_edgecolor(None)
plt.show()
Perform statistical analysis
## Prepare data: regress ship tracks on the ConvGi* k-means cluster label (1..k)
NO2_ConvGistar_cluster = copy.deepcopy(no2_ConvGistar_kmeans_label).reshape(-1,1)
SHIP_TRACK = ship_freq.reshape(-1,1)
mask = ~np.isnan(NO2_ConvGistar_cluster) & ~np.isnan(SHIP_TRACK)
# BUG FIX: the regressor must be the ConvGi* cluster labels. The original line
# read `X = NO2[mask]`, silently reusing the stale continuous-NO2 array from an
# earlier section (the reported coefficient ~8.3e4 betrays an NO2-scale
# regressor, contradicting the declared variable name below).
X = NO2_ConvGistar_cluster[mask].reshape(-1, 1)
y = SHIP_TRACK[mask]
## Pearson correlation
print('Spatial correlation between NO2 column level & ship track counts:', stats.pearsonr(NO2_ConvGistar_cluster[mask], SHIP_TRACK[mask]))
# Fit OLS model: ship track count ~ ConvGi* cluster label
m3 = spreg.OLS(
    # Dependent variable
    y,
    # Independent variables
    X,
    # Dependent variable name
    name_y='Ship track',
    # Independent variable name
    name_x=['NO2 cluster with ConvGi*']
)
print(m3.summary)
Spatial correlation between NO2 column level & ship track counts: PearsonRResult(statistic=0.6235099013736775, pvalue=0.0)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : None
Dependent Variable : Ship track Number of Observations: 11162
Mean dependent var : 0.2076 Number of Variables : 2
S.D. dependent var : 0.2251 Degrees of Freedom : 11160
R-squared : 0.3919
Adjusted R-squared : 0.3919
Sum squared residual: 343.730 F-statistic : 7193.3962
Sigma-square : 0.031 Prob(F-statistic) : 0
S.E. of regression : 0.175 Log likelihood : 3585.994
Sigma-square ML : 0.031 Akaike info criterion : -7167.988
S.E of regression ML: 0.1755 Schwarz criterion : -7153.347
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT -0.9741541 0.0140322 -69.4226693 0.0000000
NO2 cluster with ConvGi* 82967.5574506 978.2307568 84.8138917 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 16.835
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 3122.068 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 1 4648.023 0.0000
Koenker-Bassett test 1 2375.033 0.0000
================================ END OF REPORT =====================================
Calculate statistics per cluster (mean & std)
## Store the cluster labels as a mask
dict_ConvGistar_cluster = {}
for i in range(optimal_k):
tmp = copy.deepcopy(no2_ConvGistar_kmeans_label)
tmp = tmp / (i + 1)
tmp[tmp != 1] = np.nan
dict_ConvGistar_cluster[i + 1] = tmp
## Copy data
no2_vcd_copied = copy.deepcopy(no2_vcd)
ship_freq_copied = copy.deepcopy(ship_freq)
## Define rounding function to designated significant numbers
def round_sf(number, significant):
    """Round ``number`` to ``significant`` significant figures.

    Parameters
    ----------
    number : float
        Value to round. 0 is returned unchanged (log10(0) is undefined).
    significant : int
        Number of significant figures to keep (>= 1).

    Returns
    -------
    float
        ``number`` rounded to the requested significant figures.
    """
    if number == 0:
        # Guard: np.log10(0) is -inf and int(np.floor(-inf)) raises
        # OverflowError, so the original implementation crashed on 0.
        return number
    return round(number, significant - int(np.floor(np.log10(abs(number)))) - 1)
## Calculate mean and std per cluster
## Multiplying by the {1.0, NaN} cluster mask keeps only that cluster's
## pixels; nanmean/nanstd then ignore everything outside the cluster.
dict_stat_per_cluster = {}
for i in range(1, optimal_k+1):
    dict_stat_per_cluster[i] = round_sf(np.nanmean(np.multiply(no2_vcd_copied, dict_ConvGistar_cluster[i])), 4), round_sf(np.nanstd(np.multiply(no2_vcd_copied, dict_ConvGistar_cluster[i])), 4)
print('Statistics of no2 measurement per cluster (mean, std):\n', dict_stat_per_cluster)
## Calculate shipping frequency per cluster (same masking approach)
dict_ship_freq_per_cluster = {}
for i in range(1, optimal_k+1):
    dict_ship_freq_per_cluster[i] = round_sf(np.nanmean(np.multiply(ship_freq_copied, dict_ConvGistar_cluster[i])), 4), round_sf(np.nanstd(np.multiply(ship_freq_copied, dict_ConvGistar_cluster[i])), 4)
print('Statistics of shipping frequency per cluster (mean, std):\n', dict_ship_freq_per_cluster)
## Clean up temporaries from this and the previous cell
del i, tmp, no2_vcd_copied, ship_freq_copied
Statistics of no2 measurement per cluster (mean, std):
{1: (1.201e-05, 5.902e-06), 2: (1.343e-05, 6.607e-06), 3: (1.485e-05, 7.577e-06), 4: (1.681e-05, 9.197e-06)}
Statistics of shipping frequency per cluster (mean, std):
{1: (0.05392, 0.06413), 2: (0.07848, 0.08036), 3: (0.2899, 0.2203), 4: (0.435, 0.2433)}
Plot box plot of clustered $ConvG_i^*$ data
## Prepare clustered data as column vectors
NO2_cluster = no2_ConvGistar_kmeans_label.reshape(-1, 1)
SHIP_TRACK = copy.deepcopy(ship_freq).reshape(-1, 1)
## Keep only pixels where both the cluster label and ship count are defined
mask = ~np.isnan(NO2_cluster) & ~np.isnan(SHIP_TRACK)
## Box plot of NO2 clustered data
## (boolean indexing with `mask` flattens both arrays to 1-D)
fig_boxplot = sns.boxplot(x='NO2_cluster', y='Ship', data={'NO2_cluster': NO2_cluster[mask], 'Ship': SHIP_TRACK[mask]}, palette=color_palette_viridis,
showfliers=False, showmeans=True, meanprops={"markerfacecolor":"red", "markeredgecolor":"red"})
fig_boxplot.set(xlabel='NO$_2$ $ConvG_i^*$ kmeans cluster label', ylabel='Normalized ship track count')
plt.show()
For temporal analysis, we focus on the comparison between shipping route (cluster with the highest label) and background (cluster with the lowest label).
Plot shipping route (cluster with the highest label) from clustered $ConvG_i^*$ data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Build shipping-route mask: keep only the cluster with the highest label
## (optimal_k), set everything else to NaN, then normalize the kept label to 1.0
no2_ConvGistar_shipping = copy.deepcopy(no2_ConvGistar_kmeans_label)
no2_ConvGistar_shipping[no2_ConvGistar_shipping != optimal_k] = np.nan
no2_ConvGistar_shipping = no2_ConvGistar_shipping / optimal_k
## Plot clustered data masked by shipping-route mask
## NOTE(review): matplotlib.cm.get_cmap is deprecated since matplotlib 3.7;
## plt.get_cmap("viridis", optimal_k) is the drop-in replacement if upgrading.
plt.imshow(no2_ConvGistar_shipping, cmap = matplotlib.cm.get_cmap("viridis", optimal_k),
extent=[extent[0], extent[1], extent[2], extent[3]])
plt.title('Shipping-route cluster')
plt.show()
Plot background (cluster with the lowest label) from clustered $ConvG_i^*$ data
## Plot base map
ax = plt.axes(projection=ccrs.PlateCarree())
ax.add_feature(cart.feature.LAND, zorder=1, facecolor='#edc9af', alpha=1)
ax.add_feature(cart.feature.COASTLINE, linewidth=1, edgecolor='black')
ax.gridlines(draw_labels=True, linestyle='--')
## Build background mask: keep only the cluster with the lowest label (1);
## all other pixels become NaN (label 1 already equals 1.0, so no division needed)
no2_ConvGistar_background = copy.deepcopy(no2_ConvGistar_kmeans_label)
no2_ConvGistar_background[no2_ConvGistar_background != 1] = np.nan
# no2_ConvGistar_background = no2_ConvGistar_shipping / 1
## Plot clustered data masked by background mask
plt.imshow(no2_ConvGistar_background, cmap = matplotlib.cm.get_cmap("viridis", optimal_k),
extent=[extent[0], extent[1], extent[2], extent[3]])
plt.title('Background cluster')
plt.show()
Generate monthly timestamp
## Generate monthly timestamp
## Monthly timestamp: first-day indices from Dec. 5th 2018 to Nov. 10th 2021 (total 1,072 days)
month_index = [0,
27, 58, 86, 117, 147, 178, 208, 239, 270, 300, 331, 361,
392, 423, 452, 483, 513, 544, 574, 605, 636, 666, 697, 727,
758, 789, 817, 848, 878, 909, 939, 970, 1001, 1031, 1062, 1072]
month_name = ['Dec. 2018',
'Jan. 2019', 'Feb. 2019', 'Mar. 2019', 'Apr. 2019', 'May. 2019', 'Jun. 2019', 'Jul. 2019', 'Aug. 2019', 'Sep. 2019', 'Oct. 2019', 'Nov. 2019', 'Dec. 2019',
'Jan. 2020', 'Feb. 2020', 'Mar. 2020', 'Apr. 2020', 'May. 2020', 'Jun. 2020', 'Jul. 2020', 'Aug. 2020', 'Sep. 2020', 'Oct. 2020', 'Nov. 2020', 'Dec. 2020',
'Jan. 2021', 'Feb. 2021', 'Mar. 2021', 'Apr. 2021', 'May. 2021', 'Jun. 2021', 'Jul. 2021', 'Aug. 2021', 'Sep. 2021', 'Oct. 2021', 'Nov. 2021']
## Map each month (string keys "0".."35") to its half-open range of day indices,
## pairing consecutive entries of month_index.
month = {str(i): range(start, stop)
         for i, (start, stop) in enumerate(zip(month_index[:-1], month_index[1:]))}
Calculate monthly statistics per clusters (e.g. shipping route vs. background)
## Calculate monthly mean & std per cluster (shipping route vs. background)
## PERF: the original deep-copied the whole no2_vcd cube for every one of the
## four statistics in every iteration; fancy indexing with a range already
## returns a copy and np.multiply allocates a new array, so each month's
## masked slab is now computed once and reused.
no2_monthly_mean_shipping = {}
no2_monthly_mean_background = {}
no2_monthly_std_shipping = {}
no2_monthly_std_background = {}
for i in range(len(month_index) - 1):
    key = "{0}".format(i)  # same string keys as before ("0".."35")
    month_slab = no2_vcd[month[key], :, :]  # (days, lat, lon) slab for month i
    ship_slab = np.multiply(month_slab, no2_ConvGistar_shipping)    # shipping-route pixels only
    back_slab = np.multiply(month_slab, no2_ConvGistar_background)  # background pixels only
    ship_pixel_mean = np.nanmean(ship_slab, axis=0)  # per-pixel mean over the month's days
    back_pixel_mean = np.nanmean(back_slab, axis=0)
    no2_monthly_mean_shipping[key] = np.nanmean(ship_pixel_mean)
    no2_monthly_mean_background[key] = np.nanmean(back_pixel_mean)
    no2_monthly_std_shipping[key] = np.nanstd(ship_pixel_mean)   # spatial std of per-pixel means
    no2_monthly_std_background[key] = np.nanstd(back_pixel_mean)
Plot the trend of monthly statistics in shipping route cluster
## Prepare data
## Index 0 of the monthly dicts is Dec. 2018; 1-12 -> 2019, 13-24 -> 2020,
## 25-35 -> 2021 (11 months only, Dec. 2021 missing)
x = np.array(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
y1 = [*no2_monthly_mean_shipping.values()][1:13]
y2 = [*no2_monthly_mean_shipping.values()][13:25]
y3 = [*no2_monthly_mean_shipping.values()][25:]
y1err = [*no2_monthly_std_shipping.values()][1:13]
y2err = [*no2_monthly_std_shipping.values()][13:25]
y3err = [*no2_monthly_std_shipping.values()][25:]
## Plot trend time series
plt.figure(figsize=(14, 7))
plt.plot(x, y1, label='Monthly mean of shipping route 2019', color='#22a884')
plt.errorbar(x, y1, yerr=y1err, fmt='o', capsize=5, alpha=0.5, color='#22a884')
plt.plot(x, y2, label='Monthly mean of shipping route 2020', color='#7ad151')
## BUG FIX: the 2020 series was previously drawn with the 2019 error bars (yerr=y1err)
plt.errorbar(x, y2, yerr=y2err, fmt='o', capsize=5, alpha=0.5, color='#7ad151')
## 2021 is incomplete, so it is plotted against numeric positions 0..10
plt.plot(range(len(y3)), y3, label='Monthly mean of shipping route 2021', color='#fde725')
plt.errorbar(range(len(y3)), y3, yerr=y3err, fmt='o', capsize=5, alpha=0.5, color='#fde725')
plt.ticklabel_format(axis='y', style='sci', scilimits=(-5, -5))
plt.xticks(rotation = 90)
plt.ylabel('NO$_2$ vertical column (mol m$^{-2}$)')
plt.legend()
plt.tight_layout()
plt.show()
Plot the trend of monthly statistics in background cluster
## Prepare data
## Index 0 of the monthly dicts is Dec. 2018; 1-12 -> 2019, 13-24 -> 2020,
## 25-35 -> 2021 (11 months only, Dec. 2021 missing)
x = np.array(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
y1 = [*no2_monthly_mean_background.values()][1:13]
y2 = [*no2_monthly_mean_background.values()][13:25]
y3 = [*no2_monthly_mean_background.values()][25:]
y1err = [*no2_monthly_std_background.values()][1:13]
y2err = [*no2_monthly_std_background.values()][13:25]
y3err = [*no2_monthly_std_background.values()][25:]
## Plot trend time series
plt.figure(figsize=(14, 7))
plt.plot(x, y1, label='Monthly mean of background 2019', color='#440154')
plt.errorbar(x, y1, yerr=y1err, fmt='o', capsize=5, alpha=0.5, color='#440154')
plt.plot(x, y2, label='Monthly mean of background 2020', color='#21918c')
## BUG FIX: the 2020 series was previously drawn with the 2019 error bars (yerr=y1err)
plt.errorbar(x, y2, yerr=y2err, fmt='o', capsize=5, alpha=0.5, color='#21918c')
## 2021 is incomplete, so it is plotted against numeric positions 0..10
plt.plot(range(len(y3)), y3, label='Monthly mean of background 2021', color='#5ec962')
plt.errorbar(range(len(y3)), y3, yerr=y3err, fmt='o', capsize=5, alpha=0.5, color='#5ec962')
plt.ticklabel_format(axis='y', style='sci', scilimits=(-5, -5))
plt.xticks(rotation = 90)
plt.ylabel('NO$_2$ vertical column (mol m$^{-2}$)')
plt.legend()
plt.tight_layout()
plt.show()
Plot the trend of monthly statistics in shipping route cluster & background cluster
## Prepare data
## Index 0 of the monthly dicts is Dec. 2018; 1-12 -> 2019, 13-24 -> 2020,
## 25-35 -> 2021 (11 months only, Dec. 2021 missing)
x = np.array(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
y1_ship = [*no2_monthly_mean_shipping.values()][1:13]
y2_ship = [*no2_monthly_mean_shipping.values()][13:25]
y3_ship = [*no2_monthly_mean_shipping.values()][25:]
y1_back = [*no2_monthly_mean_background.values()][1:13]
y2_back = [*no2_monthly_mean_background.values()][13:25]
y3_back = [*no2_monthly_mean_background.values()][25:]
y1err_ship = [*no2_monthly_std_shipping.values()][1:13]
y2err_ship = [*no2_monthly_std_shipping.values()][13:25]
y3err_ship = [*no2_monthly_std_shipping.values()][25:]
y1err_back = [*no2_monthly_std_background.values()][1:13]
y2err_back = [*no2_monthly_std_background.values()][13:25]
y3err_back = [*no2_monthly_std_background.values()][25:]
## Plot trend time series (shipping route and background on the same axes)
plt.figure(figsize=(14, 7))
plt.plot(x, y1_ship, label='Monthly NO$_2$ mean 2019 - ship', color='#22a884')
plt.errorbar(x, y1_ship, yerr=y1err_ship, fmt='o', capsize=5, alpha=0.5, color='#22a884')
plt.plot(x, y2_ship, label='Monthly NO$_2$ mean 2020 - ship', color='#7ad151')
plt.errorbar(x, y2_ship, yerr=y2err_ship, fmt='o', capsize=5, alpha=0.5, color='#7ad151')
## 2021 series are plotted against numeric positions 0..10 (incomplete year)
plt.plot(range(len(y3_ship)), y3_ship, label='Monthly NO$_2$ mean 2021 - ship', color='#fde725')
plt.errorbar(range(len(y3_ship)), y3_ship, yerr=y3err_ship, fmt='o', capsize=5, alpha=0.5, color='#fde725')
plt.plot(x, y1_back, label='Monthly NO$_2$ mean 2019 - back', color='#440154')
plt.errorbar(x, y1_back, yerr=y1err_back, fmt='o', capsize=5, alpha=0.5, color='#440154')
plt.plot(x, y2_back, label='Monthly NO$_2$ mean 2020 - back', color='#414487')
plt.errorbar(x, y2_back, yerr=y2err_back, fmt='o', capsize=5, alpha=0.5, color='#414487')
plt.plot(range(len(y3_back)), y3_back, label='Monthly NO$_2$ mean 2021 - back', color='#2a788e')
plt.errorbar(range(len(y3_back)), y3_back, yerr=y3err_back, fmt='o', capsize=5, alpha=0.5, color='#2a788e')
plt.ticklabel_format(axis='y', style='sci', scilimits=(-5, -5))
plt.xticks(rotation = 90)
plt.ylabel("NO$_2$ vertical column (mol m$^{-2}$)")
plt.legend()
plt.tight_layout()
plt.show()
Plot the trend of monthly statistics in shipping route cluster, compared with the global container throughput index
## Prepare data (all 36 months, Dec. 2018 - Nov. 2021)
x = np.array(month_name)
y_ship = [*no2_monthly_mean_shipping.values()]
y_back = [*no2_monthly_mean_background.values()]
yerr_ship = [*no2_monthly_std_shipping.values()]
yerr_back = [*no2_monthly_std_background.values()]
## NOTE(review): fetches the RWI/ISL index over the network at run time and
## requires an Excel engine (openpyxl) — consider caching the file locally
global_throughput_index = pd.read_excel('https://www.isl.org/public/containerumschlag-index/2023-04/containerumschlag-index_230531.xlsx')
## Locate the rows matching the study period (Dec. 2018 - Nov. 2021)
start = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2018, 12, 1, 0, 0)].item()
end = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2021, 11, 1, 0, 0)].item()
global_throughput_original = global_throughput_index.iloc[start:(end + 1), 1].tolist()  # column 1 = original index
## Plot trend time series on twin y-axes (NO2 left, throughput index right)
fig, ax1 = plt.subplots(figsize=(14, 7))
plt.xticks(rotation = 90)
plt.ylabel("NO$_2$ vertical column (mol m$^{-2}$)")
ax2 = ax1.twinx()
ax1.plot(x, y_ship, label='Monthly mean NO$_2$ vertical column - shipping route', color='#440154')
ax1.errorbar(x, y_ship, yerr=yerr_ship, fmt='o', capsize=5, alpha=0.5, color='#440154')
ax1.ticklabel_format(axis='y', style='sci', scilimits=(-5, -5))
ax2.plot(x, global_throughput_original, label='RWI/ISL container throughput index - original', color='#21918c')
ax2.grid(False)
ax2.set_ylim([90, 140])
plt.ylabel("Container throughput index (index$_{2015}$=100)")
## Manual legend patches because the two series live on different axes
c1_patch = mpatches.Patch(color='#440154', label='Monthly mean NO$_2$ vertical column - shipping route')
c2_patch = mpatches.Patch(color='#21918c', label='RWI/ISL container throughput index - original')
plt.legend(handles=[c1_patch, c2_patch], loc='upper right')
plt.tight_layout()
plt.show()
Perform seasonal decomposition on the data
## Perform seasonal decomposition (additive model, 12-month period) on the
## shipping-route monthly mean series
seasonal_decomp = statsmodels.tsa.seasonal.seasonal_decompose(y_ship, model='additive', period=12)
## Plot seasonality
plt.figure(figsize=(14, 7))
plt.plot(seasonal_decomp.seasonal)
plt.xticks(np.arange(0, 36), pd.Series(month_name), rotation = 90)
plt.legend(['Seasonality of monthly mean NO$_2$ vertical column - shipping route'], loc='upper right')
plt.show()
## Plot deseasonalized NO2 data
## (y_ship is a plain list; the subtraction relies on the seasonal component
## broadcasting elementwise — presumably ndarray/Series semantics; verify)
plt.figure(figsize=(14, 7))
plt.plot(y_ship - seasonal_decomp.seasonal)
plt.xticks(np.arange(0, 36), pd.Series(month_name), rotation = 90)
plt.legend(['Deseasonalized monthly mean NO$_2$ vertical column - shipping route'], loc='upper right')
plt.show()
Plot the trend of deseasonalized monthly statistics in shipping route cluster, compared with the global container throughput adjusted index
## Prepare data (all 36 months, Dec. 2018 - Nov. 2021)
x = np.array(month_name)
y_ship = [*no2_monthly_mean_shipping.values()]
y_back = [*no2_monthly_mean_background.values()]
yerr_ship = [*no2_monthly_std_shipping.values()]
yerr_back = [*no2_monthly_std_background.values()]
## NOTE(review): same network fetch as above — consider caching locally
global_throughput_index = pd.read_excel('https://www.isl.org/public/containerumschlag-index/2023-04/containerumschlag-index_230531.xlsx')
start = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2018, 12, 1, 0, 0)].item()
end = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2021, 11, 1, 0, 0)].item()
global_throughput_adjusted = global_throughput_index.iloc[start:(end + 1), 2].tolist()  # column 2 = seasonally/working-day adjusted index
## Plot trend time series on twin y-axes (deseasonalized NO2 left, index right)
fig, ax1 = plt.subplots(figsize=(14, 7))
plt.xticks(rotation = 90)
plt.ylabel("NO$_2$ vertical column (mol m$^{-2}$)")
ax2 = ax1.twinx()
ax1.plot(x, y_ship - seasonal_decomp.seasonal, label='Deseasonalized monthly mean NO$_2$ vertical column - shipping route', color='#440154')
ax1.errorbar(x, y_ship - seasonal_decomp.seasonal, yerr=yerr_ship, fmt='o', capsize=5, alpha=0.5, color='#440154')
ax1.ticklabel_format(axis='y', style='sci', scilimits=(-4, -4))
ax2.plot(x, global_throughput_adjusted, label='RWI/ISL container throughput index - adjusted', color='#21918c')
ax2.grid(False)
ax2.set_ylim([90, 140])
plt.ylabel("Container throughput index (index$_{2015}$=100)")
## Manual legend patches because the two series live on different axes
c1_patch = mpatches.Patch(color='#440154', label='Deseasonalized monthly mean NO$_2$ vertical column - shipping route')
c2_patch = mpatches.Patch(color='#21918c', label='RWI/ISL container throughput index - adjusted')
plt.legend(handles=[c1_patch, c2_patch], loc='upper right')
plt.tight_layout()
plt.show()
Perform statistical analysis
## Monthly mean NO2 on shipping route vs. global throughput index
## Pearson correlation: raw series vs. original index, and deseasonalized
## series vs. the adjusted index
print('Temporal Pearson correlation between shipping & throughput index (original):', stats.pearsonr(y_ship, global_throughput_original))
print('Temporal Pearson correlation between deseasonalized shipping & throughput index (adjusted):', stats.pearsonr(y_ship - seasonal_decomp.seasonal, global_throughput_adjusted))
Temporal Pearson correlation between shipping & throughput index (original): PearsonRResult(statistic=0.17019354794446567, pvalue=0.3210086750037733) Temporal Pearson correlation between deseasonalized shipping & throughput index (adjusted): PearsonRResult(statistic=0.6750764374545704, pvalue=6.2965013285031515e-06)
Calculate yearly mean & std for the shipping-route cluster (NO2 vertical column)
## Prepare data for shipping-route cluster
## PERF: np.multiply already allocates a new array and never mutates its
## inputs, so the deepcopy of the whole cube is dropped.
data_shipping = np.multiply(no2_vcd, no2_ConvGistar_shipping)
## Yearly windows: 27 days of Dec. 2018, then 2019 (365 d) and 2020 (366 d).
## The 2021 slice nominally asks for 366 days but the cube has 1,072 days in
## total, so numpy clamps it to the remaining 314 days (Jan. 1 - Nov. 10, 2021).
data_shipping_2019 = data_shipping[27:(27 + 365), :, :]
data_shipping_2020 = data_shipping[(27 + 365):(27 + 365 + 366), :, :]
data_shipping_2021 = data_shipping[(27 + 365 + 366):(27 + 365 + 366 + 366), :, :]
## 2019 NO2 column density
print('NO2 mean (2019):', round_sf(np.nanmean(data_shipping_2019), 4), '\nNO2 std (2019):', round_sf(np.nanstd(data_shipping_2019), 4))
## 2020 NO2 column density
print('NO2 mean (2020):', round_sf(np.nanmean(data_shipping_2020), 4), '\nNO2 std (2020):', round_sf(np.nanstd(data_shipping_2020), 4))
## 2021 NO2 column density
print('NO2 mean (2021):', round_sf(np.nanmean(data_shipping_2021), 4), '\nNO2 std (2021):', round_sf(np.nanstd(data_shipping_2021), 4))
NO2 mean (2019): 1.661e-05 NO2 std (2019): 9.476e-06 NO2 mean (2020): 1.627e-05 NO2 std (2020): 9.098e-06 NO2 mean (2021): 1.75e-05 NO2 std (2021): 8.984e-06
## 2020/2019 ratio of yearly per-pixel means (shipping-route cluster)
ratio_shipping_2020_2019 = np.nanmean(data_shipping_2020, axis=0) / np.nanmean(data_shipping_2019, axis=0)
print('NO2 ratio mean (2020/2019):', round_sf(np.nanmean(ratio_shipping_2020_2019), 4), '\nNO2 ratio std (2020/2019):', round_sf(np.nanstd(ratio_shipping_2020_2019), 4))
## 2021/2020 ratio
ratio_shipping_2021_2020 = np.nanmean(data_shipping_2021, axis=0) / np.nanmean(data_shipping_2020, axis=0)
print('NO2 ratio mean (2021/2020):', round_sf(np.nanmean(ratio_shipping_2021_2020), 4), '\nNO2 ratio std (2021/2020):', round_sf(np.nanstd(ratio_shipping_2021_2020), 4))
## 2021/2019 ratio (comment fixed: was mislabeled "2020/2019")
ratio_shipping_2021_2019 = np.nanmean(data_shipping_2021, axis=0) / np.nanmean(data_shipping_2019, axis=0)
print('NO2 ratio mean (2021/2019):', round_sf(np.nanmean(ratio_shipping_2021_2019), 4), '\nNO2 ratio std (2021/2019):', round_sf(np.nanstd(ratio_shipping_2021_2019), 4))
NO2 ratio mean (2020/2019): 0.9799 NO2 ratio std (2020/2019): 0.03766 NO2 ratio mean (2021/2020): 1.077 NO2 ratio std (2021/2020): 0.0496 NO2 ratio mean (2021/2019): 1.055 NO2 ratio std (2021/2019): 0.05015
Calculate yearly mean & std for the background cluster (NO2 vertical column)
## Prepare data for background cluster
## PERF: np.multiply already allocates a new array and never mutates its
## inputs, so the deepcopy of the whole cube is dropped.
data_background = np.multiply(no2_vcd, no2_ConvGistar_background)
## Yearly windows: 27 days of Dec. 2018, then 2019 (365 d) and 2020 (366 d).
## The 2021 slice nominally asks for 366 days but the cube has 1,072 days in
## total, so numpy clamps it to the remaining 314 days (Jan. 1 - Nov. 10, 2021).
data_background_2019 = data_background[27:(27 + 365), :, :]
data_background_2020 = data_background[(27 + 365):(27 + 365 + 366), :, :]
data_background_2021 = data_background[(27 + 365 + 366):(27 + 365 + 366 + 366), :, :]
## 2019 NO2 column density
print('NO2 mean (2019):', round_sf(np.nanmean(data_background_2019), 4), '\nNO2 std (2019):', round_sf(np.nanstd(data_background_2019), 4))
## 2020 NO2 column density
print('NO2 mean (2020):', round_sf(np.nanmean(data_background_2020), 4), '\nNO2 std (2020):', round_sf(np.nanstd(data_background_2020), 4))
## 2021 NO2 column density
print('NO2 mean (2021):', round_sf(np.nanmean(data_background_2021), 4), '\nNO2 std (2021):', round_sf(np.nanstd(data_background_2021), 4))
NO2 mean (2019): 1.16e-05 NO2 std (2019): 5.712e-06 NO2 mean (2020): 1.168e-05 NO2 std (2020): 6.087e-06 NO2 mean (2021): 1.273e-05 NO2 std (2021): 5.837e-06
## 2020/2019 ratio of yearly per-pixel means (background cluster)
ratio_background_2020_2019 = np.nanmean(data_background_2020, axis=0) / np.nanmean(data_background_2019, axis=0)
print('NO2 ratio mean (2020/2019):', round_sf(np.nanmean(ratio_background_2020_2019), 4), '\nNO2 ratio std (2020/2019):', round_sf(np.nanstd(ratio_background_2020_2019), 4))
## 2021/2020 ratio
ratio_background_2021_2020 = np.nanmean(data_background_2021, axis=0) / np.nanmean(data_background_2020, axis=0)
print('NO2 ratio mean (2021/2020):', round_sf(np.nanmean(ratio_background_2021_2020), 4), '\nNO2 ratio std (2021/2020):', round_sf(np.nanstd(ratio_background_2021_2020), 4))
## 2021/2019 ratio (comment fixed: was mislabeled "2020/2019")
ratio_background_2021_2019 = np.nanmean(data_background_2021, axis=0) / np.nanmean(data_background_2019, axis=0)
print('NO2 ratio mean (2021/2019):', round_sf(np.nanmean(ratio_background_2021_2019), 4), '\nNO2 ratio std (2021/2019):', round_sf(np.nanstd(ratio_background_2021_2019), 4))
NO2 ratio mean (2020/2019): 1.008 NO2 ratio std (2020/2019): 0.03899 NO2 ratio mean (2021/2020): 1.091 NO2 ratio std (2021/2020): 0.05854 NO2 ratio mean (2021/2019): 1.099 NO2 ratio std (2021/2019): 0.04806
## Load TROPOMI dataset (netCDF)
no2_nc_filename = "./dataset/cube_no2_shipping_RedSea.nc"
## Read netCDF file
ds = nc.Dataset(no2_nc_filename, 'r')
## Store dimension information (time, latitude, and longitude)
dim_names = list(ds.dimensions) # get variable keys
time, lats, lons = ds.variables[dim_names[0]][:], ds.variables[dim_names[1]][:], ds.variables[dim_names[2]][:]
print("Dimensions of netCDF NO2 data:")
for dim in ds.dimensions.items():
    print(dim)
## Store NO2 column measurements from netCDF dataset
product = ds['/PRODUCT/SUPPORT_DATA/DETAILED_RESULTS']
no2_scd_attr = list(product.variables.items())
no2_scd_name = no2_scd_attr[0][0]
no2_scd = product.variables[no2_scd_name][:len(time)] # slant column (variable name and later plot labels say SCD; original comment said "vertical column") -- TODO confirm against product metadata
## Close netCDF file
ds.close()
## Crop the dataset and dimension w.r.t. the region of interest
## If interested in a smaller ROI, change lats_roi = (0, lats_roi), and lons_roi = (0, lons_roi) where lats_roi <= lats.size and lons_roi <= lons.size
lats_roi = (0, lats.size)
lons_roi = (0, lons.size)
lats, lons = lats[lats_roi[0]:lats_roi[1]], lons[lons_roi[0]:lons_roi[1]]
no2_scd = no2_scd[:, lats_roi[0]:lats_roi[1], lons_roi[0]:lons_roi[1]]
extent = [min(lons), max(lons), min(lats), max(lats)] # coordinate extent of given netCDF dataset
## only run when Indian Ocean
# no2_scd[330, :, 160:] = np.nan ## Part of Indian Ocean dataset on 29 October 2019 should be masked due to large noise
## Clean up temporaries
del dim_names, dim, ds, product, no2_scd_attr, no2_scd_name
Dimensions of netCDF NO2 data:
('Time', <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'Time', size = 1072)
('Latitude', <class 'netCDF4._netCDF4.Dimension'>: name = 'Latitude', size = 320)
('Longitude', <class 'netCDF4._netCDF4.Dimension'>: name = 'Longitude', size = 320)
Calculate monthly statistics per clusters (e.g. shipping route vs. background)
## Calculate monthly mean & std per cluster (slant column version)
## PERF: the original deep-copied the whole no2_scd cube for every one of the
## four statistics in every iteration; fancy indexing with a range already
## returns a copy and np.multiply allocates a new array, so each month's
## masked slab is now computed once and reused.
no2_monthly_mean_shipping = {}
no2_monthly_mean_background = {}
no2_monthly_std_shipping = {}
no2_monthly_std_background = {}
for i in range(len(month_index) - 1):
    key = "{0}".format(i)  # same string keys as before ("0".."35")
    month_slab = no2_scd[month[key], :, :]  # (days, lat, lon) slab for month i
    ship_slab = np.multiply(month_slab, no2_ConvGistar_shipping)    # shipping-route pixels only
    back_slab = np.multiply(month_slab, no2_ConvGistar_background)  # background pixels only
    ship_pixel_mean = np.nanmean(ship_slab, axis=0)  # per-pixel mean over the month's days
    back_pixel_mean = np.nanmean(back_slab, axis=0)
    no2_monthly_mean_shipping[key] = np.nanmean(ship_pixel_mean)
    no2_monthly_mean_background[key] = np.nanmean(back_pixel_mean)
    no2_monthly_std_shipping[key] = np.nanstd(ship_pixel_mean)   # spatial std of per-pixel means
    no2_monthly_std_background[key] = np.nanstd(back_pixel_mean)
Plot the trend of monthly statistics in shipping route cluster, compared with the global container throughput index
## Prepare data (all 36 months, Dec. 2018 - Nov. 2021; slant column)
x = np.array(month_name)
y_ship = [*no2_monthly_mean_shipping.values()]
y_back = [*no2_monthly_mean_background.values()]
yerr_ship = [*no2_monthly_std_shipping.values()]
yerr_back = [*no2_monthly_std_background.values()]
## NOTE(review): fetches the RWI/ISL index over the network at run time and
## requires an Excel engine (openpyxl) — consider caching the file locally
global_throughput_index = pd.read_excel('https://www.isl.org/public/containerumschlag-index/2023-04/containerumschlag-index_230531.xlsx')
start = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2018, 12, 1, 0, 0)].item()
end = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2021, 11, 1, 0, 0)].item()
global_throughput_original = global_throughput_index.iloc[start:(end + 1), 1].tolist()  # column 1 = original index
## Plot trend time series on twin y-axes (NO2 left, throughput index right)
fig, ax1 = plt.subplots(figsize=(14, 7))
plt.xticks(rotation = 90)
plt.ylabel("NO$_2$ slant column (mol m$^{-2}$)")
ax2 = ax1.twinx()
ax1.plot(x, y_ship, label='Monthly mean NO$_2$ slant column - shipping route', color='#440154')
ax1.errorbar(x, y_ship, yerr=yerr_ship, fmt='o', capsize=5, alpha=0.5, color='#440154')
ax1.ticklabel_format(axis='y', style='sci', scilimits=(-4, -4))
ax2.plot(x, global_throughput_original, label='RWI/ISL container throughput index - original', color='#21918c')
ax2.grid(False)
ax2.set_ylim([90, 140])
plt.ylabel("Container throughput index (index$_{2015}$=100)")
## Manual legend patches because the two series live on different axes
c1_patch = mpatches.Patch(color='#440154', label='Monthly mean NO$_2$ slant column - shipping route')
c2_patch = mpatches.Patch(color='#21918c', label='RWI/ISL container throughput index - original')
plt.legend(handles=[c1_patch, c2_patch], loc='upper right')
plt.tight_layout()
plt.show()
Perform seasonal decomposition on the data
## Perform seasonal decomposition (additive model, 12-month period) on the
## shipping-route monthly mean slant-column series
seasonal_decomp = statsmodels.tsa.seasonal.seasonal_decompose(y_ship, model='additive', period=12)
## Plot seasonality
plt.figure(figsize=(14, 7))
plt.plot(seasonal_decomp.seasonal)
plt.xticks(np.arange(0, 36), pd.Series(month_name), rotation = 90)
plt.legend(['Seasonality of monthly mean NO$_2$ slant column - shipping route'], loc='upper right')
plt.show()
## Plot deseasonalized NO2 data
## (y_ship is a plain list; the subtraction relies on the seasonal component
## broadcasting elementwise — presumably ndarray/Series semantics; verify)
plt.figure(figsize=(14, 7))
plt.plot(y_ship - seasonal_decomp.seasonal)
plt.xticks(np.arange(0, 36), pd.Series(month_name), rotation = 90)
plt.legend(['Deseasonalized monthly mean NO$_2$ slant column - shipping route'], loc='upper right')
plt.show()
Plot the trend of deseasonalized monthly statistics in shipping route cluster, compared with the global container throughput adjusted index
## Prepare data (all 36 months, Dec. 2018 - Nov. 2021; slant column)
x = np.array(month_name)
y_ship = [*no2_monthly_mean_shipping.values()]
y_back = [*no2_monthly_mean_background.values()]
yerr_ship = [*no2_monthly_std_shipping.values()]
yerr_back = [*no2_monthly_std_background.values()]
## NOTE(review): same network fetch as above — consider caching locally
global_throughput_index = pd.read_excel('https://www.isl.org/public/containerumschlag-index/2023-04/containerumschlag-index_230531.xlsx')
start = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2018, 12, 1, 0, 0)].item()
end = global_throughput_index.index[global_throughput_index.iloc[:, 0] == datetime.datetime(2021, 11, 1, 0, 0)].item()
global_throughput_adjusted = global_throughput_index.iloc[start:(end + 1), 2].tolist()  # column 2 = seasonally/working-day adjusted index
## Plot trend time series on twin y-axes (deseasonalized NO2 left, index right)
fig, ax1 = plt.subplots(figsize=(14, 7))
plt.xticks(rotation = 90)
plt.ylabel("NO$_2$ slant column (mol m$^{-2}$)")
ax2 = ax1.twinx()
ax1.plot(x, y_ship - seasonal_decomp.seasonal, label='Deseasonalized monthly mean NO$_2$ slant column - shipping route', color='#440154')
ax1.errorbar(x, y_ship - seasonal_decomp.seasonal, yerr=yerr_ship, fmt='o', capsize=5, alpha=0.5, color='#440154')
ax1.ticklabel_format(axis='y', style='sci', scilimits=(-4, -4))
ax2.plot(x, global_throughput_adjusted, label='RWI/ISL container throughput index - adjusted', color='#21918c')
ax2.grid(False)
ax2.set_ylim([90, 140])
plt.ylabel("Container throughput index (index$_{2015}$=100)")
## Manual legend patches because the two series live on different axes
c1_patch = mpatches.Patch(color='#440154', label='Deseasonalized monthly mean NO$_2$ slant column - shipping route')
c2_patch = mpatches.Patch(color='#21918c', label='RWI/ISL container throughput index - adjusted')
plt.legend(handles=[c1_patch, c2_patch], loc='upper right')
plt.tight_layout()
plt.show()
Perform statistical analysis
## Monthly mean NO2 on shipping route vs. global throughput index
## Pearson correlation: raw series vs. original index, and deseasonalized
## series vs. the adjusted index
print('Temporal Pearson correlation between shipping & throughput index (original):', stats.pearsonr(y_ship, global_throughput_original))
print('Temporal Pearson correlation between deseasonalized shipping & throughput index (adjusted):', stats.pearsonr(y_ship - seasonal_decomp.seasonal, global_throughput_adjusted))
Temporal Pearson correlation between shipping & throughput index (original): PearsonRResult(statistic=0.6042468732311803, pvalue=9.519631104389306e-05) Temporal Pearson correlation between deseasonalized shipping & throughput index (adjusted): PearsonRResult(statistic=0.7057877159188211, pvalue=1.5235283878666406e-06)
Calculate yearly mean & std for the shipping-route cluster (NO2 slant column)
## Prepare data for shipping-route cluster (slant column)
## PERF: np.multiply already allocates a new array and never mutates its
## inputs, so the deepcopy of the whole cube is dropped.
data_shipping = np.multiply(no2_scd, no2_ConvGistar_shipping)
## Yearly windows: 27 days of Dec. 2018, then 2019 (365 d) and 2020 (366 d).
## The 2021 slice nominally asks for 366 days but the cube has 1,072 days in
## total, so numpy clamps it to the remaining 314 days (Jan. 1 - Nov. 10, 2021).
data_shipping_2019 = data_shipping[27:(27 + 365), :, :]
data_shipping_2020 = data_shipping[(27 + 365):(27 + 365 + 366), :, :]
data_shipping_2021 = data_shipping[(27 + 365 + 366):(27 + 365 + 366 + 366), :, :]
## 2019 NO2 column density
print('NO2 mean (2019):', round_sf(np.nanmean(data_shipping_2019), 5), '\nNO2 std (2019):', round_sf(np.nanstd(data_shipping_2019), 4))
## 2020 NO2 column density
print('NO2 mean (2020):', round_sf(np.nanmean(data_shipping_2020), 5), '\nNO2 std (2020):', round_sf(np.nanstd(data_shipping_2020), 4))
## 2021 NO2 column density
print('NO2 mean (2021):', round_sf(np.nanmean(data_shipping_2021), 5), '\nNO2 std (2021):', round_sf(np.nanstd(data_shipping_2021), 4))
NO2 mean (2019): 0.00012139 NO2 std (2019): 2.194e-05 NO2 mean (2020): 0.00011606 NO2 std (2020): 1.815e-05 NO2 mean (2021): 0.00012652 NO2 std (2021): 1.942e-05
## 2020/2019 ratio of yearly per-pixel means (shipping-route cluster, slant column)
ratio_shipping_2020_2019 = np.nanmean(data_shipping_2020, axis=0) / np.nanmean(data_shipping_2019, axis=0)
print('NO2 ratio mean (2020/2019):', round_sf(np.nanmean(ratio_shipping_2020_2019), 4), '\nNO2 ratio std (2020/2019):', round_sf(np.nanstd(ratio_shipping_2020_2019), 4))
## 2021/2020 ratio
ratio_shipping_2021_2020 = np.nanmean(data_shipping_2021, axis=0) / np.nanmean(data_shipping_2020, axis=0)
print('NO2 ratio mean (2021/2020):', round_sf(np.nanmean(ratio_shipping_2021_2020), 4), '\nNO2 ratio std (2021/2020):', round_sf(np.nanstd(ratio_shipping_2021_2020), 4))
## 2021/2019 ratio (comment fixed: was mislabeled "2020/2019")
ratio_shipping_2021_2019 = np.nanmean(data_shipping_2021, axis=0) / np.nanmean(data_shipping_2019, axis=0)
print('NO2 ratio mean (2021/2019):', round_sf(np.nanmean(ratio_shipping_2021_2019), 4), '\nNO2 ratio std (2021/2019):', round_sf(np.nanstd(ratio_shipping_2021_2019), 4))
NO2 ratio mean (2020/2019): 0.9556 NO2 ratio std (2020/2019): 0.007674 NO2 ratio mean (2021/2020): 1.09 NO2 ratio std (2021/2020): 0.01014 NO2 ratio mean (2021/2019): 1.041 NO2 ratio std (2021/2019): 0.00885
Calculate yearly mean & std for the background cluster (NO2 slant column)
## Prepare data for background cluster (slant column)
## PERF: np.multiply already allocates a new array and never mutates its
## inputs, so the deepcopy of the whole cube is dropped.
data_background = np.multiply(no2_scd, no2_ConvGistar_background)
## Yearly windows: 27 days of Dec. 2018, then 2019 (365 d) and 2020 (366 d).
## The 2021 slice nominally asks for 366 days but the cube has 1,072 days in
## total, so numpy clamps it to the remaining 314 days (Jan. 1 - Nov. 10, 2021).
data_background_2019 = data_background[27:(27 + 365), :, :]
data_background_2020 = data_background[(27 + 365):(27 + 365 + 366), :, :]
data_background_2021 = data_background[(27 + 365 + 366):(27 + 365 + 366 + 366), :, :]
## 2019 NO2 column density
print('NO2 mean (2019):', round_sf(np.nanmean(data_background_2019), 5), '\nNO2 std (2019):', round_sf(np.nanstd(data_background_2019), 4))
## 2020 NO2 column density
print('NO2 mean (2020):', round_sf(np.nanmean(data_background_2020), 5), '\nNO2 std (2020):', round_sf(np.nanstd(data_background_2020), 4))
## 2021 NO2 column density
print('NO2 mean (2021):', round_sf(np.nanmean(data_background_2021), 5), '\nNO2 std (2021):', round_sf(np.nanstd(data_background_2021), 4))
NO2 mean (2019): 0.000115 NO2 std (2019): 1.969e-05 NO2 mean (2020): 0.00011017 NO2 std (2020): 1.663e-05 NO2 mean (2021): 0.00012118 NO2 std (2021): 1.839e-05
## 2020/2019 ratio of yearly per-pixel means (background cluster, slant column)
ratio_background_2020_2019 = np.nanmean(data_background_2020, axis=0) / np.nanmean(data_background_2019, axis=0)
print('NO2 ratio mean (2020/2019):', round_sf(np.nanmean(ratio_background_2020_2019), 4), '\nNO2 ratio std (2020/2019):', round_sf(np.nanstd(ratio_background_2020_2019), 4))
## 2021/2020 ratio
ratio_background_2021_2020 = np.nanmean(data_background_2021, axis=0) / np.nanmean(data_background_2020, axis=0)
print('NO2 ratio mean (2021/2020):', round_sf(np.nanmean(ratio_background_2021_2020), 4), '\nNO2 ratio std (2021/2020):', round_sf(np.nanstd(ratio_background_2021_2020), 4))
## 2021/2019 ratio (comment fixed: was mislabeled "2020/2019")
ratio_background_2021_2019 = np.nanmean(data_background_2021, axis=0) / np.nanmean(data_background_2019, axis=0)
print('NO2 ratio mean (2021/2019):', round_sf(np.nanmean(ratio_background_2021_2019), 4), '\nNO2 ratio std (2021/2019):', round_sf(np.nanstd(ratio_background_2021_2019), 4))
NO2 ratio mean (2020/2019): 0.9574 NO2 ratio std (2020/2019): 0.007282 NO2 ratio mean (2021/2020): 1.099 NO2 ratio std (2021/2020): 0.008152 NO2 ratio mean (2021/2019): 1.052 NO2 ratio std (2021/2019): 0.00759